Data source

Bustos Roman, M. (2022). Steam Games Dataset [Data set]. Kaggle. https://doi.org/10.34740/KAGGLE/DS/2109585

Load Libraries

Extract data from JSON file

The JSON has a nested structure, where each game’s data is in a list that is represented by the game id (appID).

The tags feature contains a value that resembles a dictionary/table, with user-chosen tags as keys and each tag's frequency as the value for each game. I will create a separate dataframe for this feature for further analysis.

# import raw data and combine them

# Accumulator lists (not data.frames): one per-game data.frame is appended
# to each of these, then everything is combined once after the loop.
games_list <- list()
tags_list <- list()

# Read the JSON dataset (nested: one entry per game, keyed by appID).
# NOTE(review): `fromJSON` presumably comes from a JSON package loaded in the
# "Load Libraries" section (e.g. jsonlite) -- confirm against those calls.
dataset <- fromJSON("games.json")

# Iterate over each appID in the dataset and store the game data within `appID` into a list

# Helper: return `x` unless it is absent (zero-length, i.e. the JSON field
# was missing) or entirely NA, in which case fall back to `default`.
# NOTE(review): coalesce() alone does not guard against zero-length fields;
# a missing JSON field would yield e.g. numeric(0) and make data.frame() fail.
value_or <- function(x, default) {
  if (length(x) == 0 || all(is.na(x))) default else x
}

# Helper: collapse a (possibly empty) character vector into a single
# comma-separated string, e.g. c("Indie", "Action") -> "Indie, Action".
# An absent field collapses to "" (same as the original paste() behaviour).
collapse_csv <- function(x) {
  paste(as.character(x), collapse = ", ")
}

# Preallocate one slot per game so list growth is O(n), not quadratic.
games_list <- vector("list", length(dataset))
tags_list <- vector("list", length(dataset))

for (i in seq_along(dataset)) {
  app <- names(dataset)[i]
  game <- dataset[[app]]

  # Collect one row of game-level data.
  game_info <- data.frame(
    appID                = app,
    name                 = value_or(as.character(game$name), NA),
    releaseDate          = value_or(as.Date(game$release_date, format = "%b %d, %Y"), as.Date(NA)),
    estimatedOwners      = value_or(as.factor(game$estimated_owners), NA),
    required_age         = value_or(as.numeric(game$required_age), 0),
    price                = value_or(as.numeric(game$price), 0.0),
    dlcCount             = value_or(as.numeric(game$dlc_count), 0),
    languages            = collapse_csv(game$supported_languages),
    fullAudioLanguages   = collapse_csv(game$full_audio_languages),
    positive             = value_or(as.numeric(game$positive), 0),
    negative             = value_or(as.numeric(game$negative), 0),
    achievements         = value_or(as.numeric(game$achievements), 0),
    recommendations      = value_or(as.numeric(game$recommendations), 0),
    averagePlaytime      = value_or(as.numeric(game$average_playtime_forever), 0),
    medianPlaytime       = value_or(as.numeric(game$median_playtime_forever), 0),
    developers           = collapse_csv(game$developers),
    publishers           = collapse_csv(game$publishers),
    genres               = collapse_csv(game$genres)
  )

  # some other excluded features (kept for reference)
    #supportWindows       = coalesce(as.logical(game$windows), FALSE),
    #supportMac           = coalesce(as.logical(game$mac), FALSE),
    #supportLinux         = coalesce(as.logical(game$linux), FALSE),
    #metacriticScore      = coalesce(as.numeric(game$metacritic_score), 0),
    #userScore            = coalesce(as.numeric(game$user_score), 0),

  games_list[[i]] <- game_info

  # Tags: one row per (appID, tag) pair when tags exist and are named;
  # otherwise a single NA placeholder row so every game appears in tags_df.
  if (!is.null(game$tags) && length(names(game$tags)) > 0) {
    tags_info <- data.frame(
      appID     = app,
      tag       = names(game$tags),
      frequency = as.numeric(game$tags),
      stringsAsFactors = FALSE
    )
  } else {
    tags_info <- data.frame(
      appID     = app,
      tag       = NA,
      frequency = NA,
      stringsAsFactors = FALSE
    )
  }
  tags_list[[i]] <- tags_info
}

# Combine the lists of per-game rows into data.frames with a single rbind
# call (much cheaper than calling rbind once per iteration inside the loop).
games_df <- do.call(rbind, games_list)
tags_df <- do.call(rbind, tags_list)

tags_df is in long format:

head(tags_df)
appID tag frequency
20200 Indie 22
20200 Casual 21
20200 Sports 21
20200 Bowling 6
655370 Indie 109
655370 Action 103

Data processing

Since I have separated the game data set into two dataframes, I will perform data cleaning and preprocessing on these dataframes separately, and then keep only the games that remain in both dataframes before performing machine learning tasks.

games_df EDA and data cleaning

# Normalise every character column in one pass: first trim leading/trailing
# whitespace, then drop characters that cannot be transliterated to ASCII.
games_df <- games_df %>%
  mutate(
    across(
      where(is.character),
      ~ iconv(
        gsub("^\\s+|\\s+$", "", .x),
        from = "UTF-8", to = "ASCII//TRANSLIT", sub = ""
      )
    )
  )

There are a total of 17 features in the games dataframe (excluding appID), as shown by the 18-column summary below:

skim(games_df)
Data summary
Name games_df
Number of rows 97410
Number of columns 18
_______________________
Column type frequency:
character 7
Date 1
factor 1
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
appID 0 1 2 7 0 97410 0
name 0 1 0 207 7 94914 0
languages 0 1 0 1010 4831 12862 0
fullAudioLanguages 0 1 0 1010 57017 2500 0
developers 0 1 0 624 4873 55234 0
publishers 0 1 0 164 5099 48395 0
genres 0 1 0 254 4841 2843 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
releaseDate 131 1 1997-06-30 2025-04-14 2021-07-09 4631

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
estimatedOwners 0 1 FALSE 14 0 -: 61433, 0 -: 17263, 200: 8034, 500: 3978

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
required_age 0 1 0.28 2.14 0 0.00 0.00 0.00 21.00 ▇▁▁▁▁
price 0 1 7.09 12.39 0 0.99 4.19 9.99 999.98 ▇▁▁▁▁
dlcCount 0 1 0.49 12.83 0 0.00 0.00 0.00 2366.00 ▇▁▁▁▁
positive 0 1 848.94 22870.11 0 0.00 5.00 35.00 5764420.00 ▇▁▁▁▁
negative 0 1 141.07 4278.02 0 0.00 1.00 10.00 895978.00 ▇▁▁▁▁
achievements 0 1 18.56 160.41 0 0.00 0.00 17.00 9821.00 ▇▁▁▁▁
recommendations 0 1 690.51 16817.17 0 0.00 0.00 0.00 3441592.00 ▇▁▁▁▁
averagePlaytime 0 1 91.80 1068.49 0 0.00 0.00 0.00 145727.00 ▇▁▁▁▁
medianPlaytime 0 1 81.85 1412.50 0 0.00 0.00 0.00 208473.00 ▇▁▁▁▁

The features genres, developers, publishers, languages, and fullAudioLanguages are all of character datatype; each stores a varying number of elements within a single comma-separated string. Since this study will not focus on text processing, I will convert these features into count variables.

Below defines a function to count amount of comma separated elements inside a string.

# Count the number of comma-separated elements in a string.
#
# Elements are assumed to be separated by ", " (comma + space), as produced
# by paste(..., collapse = ", ") during extraction, e.g.
# "Indie, Casual, Sports" -> 3.
#
# @param string A single character value.
# @return Number of elements; 0 for empty or missing (NA) input.
count_values <- function(string) {
  # Guard against missing or empty input. The original only checked
  # nchar(string) == 0, and nchar(NA_character_) is NA, which makes the
  # `if` condition error on NA input.
  if (is.na(string) || nchar(string) == 0) {
    return(0)
  }

  # Split on ", " and count the pieces. Elements are not trimmed or
  # filtered here; upstream cleaning already removed stray whitespace.
  length(strsplit(string, ", ")[[1]])
}

# Derive element counts for the comma-separated string columns
# (developers, publishers, genres, and the two language columns).
games_df <- games_df %>%
  mutate(
    developerCount          = sapply(developers, count_values),
    publisherCount          = sapply(publishers, count_values),
    genresCount             = sapply(genres, count_values),
    languagesCount          = sapply(languages, count_values),
    fullAudioLanguagesCount = sapply(fullAudioLanguages, count_values)
  )

To ensure only historically active and non-free games are included, games that are free, or with no ratings, recommendations, estimatedOwners of '0 - 20000' or below, or averagePlaytime = 0, will be excluded as they are irrelevant data points. Games with releaseDate on or after '2024-07-01' will also be excluded, because these games were released too recently and may not contain sufficient data for generalization.

games_df.filtered <- games_df %>% filter(recommendations > 0 & estimatedOwners != '0 - 0' & estimatedOwners != '0 - 20000' & averagePlaytime > 0 & (positive + negative) !=0 & releaseDate < "2024-07-01" & price != 0)

dim(games_df.filtered)
## [1] 8010   23

averagePlaytime&medianPlaytime

These features contains info on how much total minutes is spent on a game by each player on average for a given game. Median means that half of all users played equal or less than this amount of minutes.

summary(games_df.filtered$averagePlaytime)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      1.0    134.0    261.0    706.4    591.8 145727.0
summary(games_df.filtered$medianPlaytime)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      1.0    133.2    253.0    629.3    510.8 208473.0

“For playtime median is more useful, as average gets skewed by hardcore players for paid games and people that dropped the game after tutorial for free-to-play games.”

https://galyonk.in/steam-spy-the-missing-manual-cc22ef6eebe1 (Sergey Galyonkin’s blog)

Relative Difference ((Mean - Median) / Mean)

A traditional skewness measure cannot be calculated since the standard deviation of the distribution of total playtime for a game is not provided, however, I will use the relative difference between the mean and median as an estimated measure of skewness:

# Relative mean-median playtime gap as a rough skewness proxy, plus the
# raw mean/median ratio. averagePlaytime > 0 is guaranteed by the earlier
# filter, so the division is safe.
games_df.filtered <- games_df.filtered %>%
  mutate(
    estimatedSkew = (averagePlaytime - medianPlaytime) / averagePlaytime,
    playtimeRatio = averagePlaytime / medianPlaytime
  )

hist(games_df.filtered$estimatedSkew)

There are many games with the exact same median and average playtime, it is very rare to see a perfectly symmetrical distribution for total gametime minutes, however, in cases of very small sample sizes for a given game (e.g., just one or two observations), the mean and median playtime can easily equal to each other, resulting in zero for this measure, but this implies that the data collected for this game is not accurate as the player data is insufficient. I will filter out games where estimatedSkew == 0.

games_df.filtered <- games_df.filtered %>% filter(estimatedSkew != 0)
dim(games_df.filtered)
## [1] 5930   25
hist(games_df.filtered$estimatedSkew, main = 'Distribution of estimated skew', xlab = 'estimatedSkew', breaks = 30)

plot(x = games_df.filtered$estimatedSkew, y = log(games_df.filtered$recommendations + 1), pch = 20,
     cex = 0.8, xlab = "Estimated Skew", ylab = "Log(Recommendations)")

The plot indicates some positive relationship between estimated skew and the log-transformed recommendation count.

Transforming minutes into hours and days.

# Convert playtime from minutes into hours and days. Later expressions in
# the same mutate() may reference the columns created before them.
games_df.filtered <- games_df.filtered %>%
  mutate(
    averagePlayHours = averagePlaytime / 60,
    medianPlayHours  = medianPlaytime / 60,
    averagePlayDays  = averagePlayHours / 24,
    medianPlayDays   = medianPlayHours / 24
  )

summary(games_df.filtered$averagePlayHours)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.0333    2.7500    4.9500   13.2060   11.3833 1737.3000
summary(games_df.filtered$averagePlayDays)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00139  0.11458  0.20625  0.55025  0.47431 72.38750

Playtime

hist(games_df.filtered$medianPlaytime, breaks = 50)

The medianPlaytime appears to have a highly skewed distribution, I will perform log transformation for this feature.

games_df.filtered <- games_df.filtered %>% mutate(logMedianPlaytime = log(1 + medianPlaytime))

# NOTE(review): logMedianPlaytime is already log-transformed (created just
# above); the original wrapped it in log() again, plotting a double-log
# while the title says "Log Median Playtime". Plot the stored value directly.
hist(games_df.filtered$logMedianPlaytime,
     breaks = 40,
     main = "Log Median Playtime distribution",
     xlab = "Log Median Playtime")

aboveTwoHours indicator

One of the most critical aspects of the 2-hour mark is that it is the threshold for Steam’s refund policy. Players can request a refund for games they have played for less than 2 hours, which incentivizes developers to create games that players find engaging enough to exceed this playtime. This can lead to a higher likelihood of players keeping the game if they enjoy it beyond the initial hours. I will create an indicator for games whose median playtime exceeds 2 hours. Source

# Indicator: 1 when median playtime exceeds Steam's 120-minute refund
# window, 0 otherwise.
games_df.filtered <- games_df.filtered %>%
  mutate(aboveTwoHours = ifelse(medianPlaytime > 120, 1, 0))

table(games_df.filtered$aboveTwoHours)
## 
##    0    1 
##  938 4992

genres, publishers, and developers variables each contains a comma separated list of the respective attributes stored in a string. I defined a helper function to count unique values within a string list feature of the dataframe.

# Build a frequency table of the unique elements inside a comma-separated
# string column of `df`.
#
# e.g. for genres values like "Casual, Indie, Sports" this returns one row
# per distinct genre with the number of games carrying it.
#
# @param df A data.frame containing the column named by `string_feature`.
# @param string_feature Name (string) of a character column whose values
#   are ", "-separated lists.
# @return A data.frame with columns <string_feature> and Frequency,
#   sorted by Frequency descending.
count_feature <- function(df, string_feature){
  # `[[` extracts the column as a plain character vector. The original
  # `df[, string_feature]` works for a data.frame but would return a
  # one-column tibble (breaking strsplit) if df were ever converted.
  split_features <- strsplit(df[[string_feature]], ", ")

  # Flatten the per-row lists into a single vector of elements.
  all_features <- unlist(split_features)

  # Tabulate how many times each unique element appears.
  feature_counts <- table(all_features)
  feature_counts_df <- as.data.frame(feature_counts)
  colnames(feature_counts_df) <- c(string_feature, "Frequency")

  # Most frequent first.
  feature_counts_df %>%
    arrange(desc(Frequency))
}

genres feature

Contains broad categories that define a game’s core mechanics and style of play.

head(count_feature(games_df.filtered, 'genres'),20)
genres Frequency
Indie 3570
Action 2904
Adventure 2375
Strategy 1442
RPG 1382
Simulation 1361
Casual 1352
Early Access 378
Racing 228
Sports 208
Massively Multiplayer 128
Utilities 39
Design & Illustration 28
Animation & Modeling 17
Violent 16
Web Publishing 14
Free to Play 11
Education 9
Gore 9
Video Production 9

Although most of the game genres are related to games, some of them refers to either production software or other non-game related software.

I am removing these unrelated genres because this project will focus on playable video games and will not include non-gaming related applications, the games with non-gaming related genres also only accounts to less than 10% of all other genres.

The following code reviews the non-gaming related genres first.

# Genres that indicate non-game software or other media; any game listing
# at least one of these is removed from the analysis.
exclude_genres <- c(
  'Audio Production', 'Game Development', 'Design & Illustration',
  'Utilities', 'Software Training', 'Photo Editing', 'Accounting',
  'Animation & Modeling', 'Web Publishing', 'Video Production',
  'Education', 'Movie', 'Short', '360 Video', 'Documentary',
  'Episodic', 'Tutorial'
)

# Split each game's genre string into a vector of individual genres.
genres_split <- strsplit(as.character(games_df.filtered$genres), ", ")

# TRUE when a genre vector contains at least one excluded genre.
has_excluded_genre <- function(genres) {
  any(genres %in% exclude_genres)
}

# Flag every game, then keep only the unflagged rows.
exclude_rows <- vapply(genres_split, has_excluded_genre, logical(1))
games_df.filtered <- games_df.filtered[!exclude_rows, ]

Barplot of genre frequency

count_feature(games_df.filtered,'genres') %>% 
  ggplot(aes(x = reorder(genres, Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +  # stat based on the values provided in the data frame.
  coord_flip() +                  
  labs(title = "Genre Frequency (Filtered)", x = "Genre", y = "Frequency")

age16

This feature indicates whether a game’s required age is 16+.

# Indicator: TRUE when a game's required age is 16 or above.
# NOTE(review): the original used `required_age > 16`, which excludes
# exactly-16 ratings even though the feature is described as "16+";
# `>=` matches the stated intent. The redundant ifelse(cond, TRUE, FALSE)
# wrapper is also dropped -- the comparison already yields a logical.
games_df.filtered <- games_df.filtered %>%
  mutate(age16 = required_age >= 16)

boxplot(price ~ age16  ,data = games_df.filtered)

boxplot(log(recommendations + 1) ~ age16, data = games_df.filtered, ylab = "log(recommendations)")

table(games_df.filtered$age16)
## 
## FALSE  TRUE 
##  5350   529

It could be seen that games with age requirement of 16+ are both more recommended and higher priced. But the class is heavily imbalanced, stratified sampling method would be needed when using this predictor for training.

### developers & publishers

These are the developers who developed the game, and publishers who are responsible for distribution, marketing and sales of a game.

str(games_df.filtered$developers)
##  chr [1:5879] "Pixelated Milk" "Team17 Digital Ltd" "Lonely Troops" ...

Distribution of developersCount,publishersCount,genresCount

hist(games_df.filtered$developerCount, main = "Developers Count")

hist(games_df.filtered$publisherCount, main = "Publishers Count")

hist(games_df.filtered$genresCount, main = "Genres Count")

hist(games_df.filtered$languagesCount, main = "Languages Count")

Most of the games have a single developer and publisher, with a combination of 2-3 genres.

Self Publishers

As self publishing a game, where the developer and the publisher are the same entity, is a very likely indicator of a game that is developed independently (Indie), identifying this characteristic may be a useful feature. I will be creating a indicator variable to indicate whether a game is self published, not including games with more than 1 developer or publisher.

# Self-published: exactly one developer and exactly one publisher, and the
# two names are identical after trimming surrounding whitespace. Games with
# multiple developers/publishers are never flagged.
games_df.filtered <- games_df.filtered %>%
  mutate(
    selfPublished = developerCount == 1 &
      publisherCount == 1 &
      trimws(developers) == trimws(publishers)
  )

table(games_df.filtered$selfPublished)
## 
## FALSE  TRUE 
##  3305  2574
boxplot(price ~ selfPublished, games_df.filtered)

head(count_feature(games_df.filtered,'publishers'),20) %>% 
  ggplot(aes(x = reorder(publishers, Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +  # stat based on the values provided in the data frame.
  coord_flip() +                  
  labs(title = "Publisher Frequency (Top 20)", x = "Category", y = "Frequency")

Frequency of developers and publishers in games with over 50,000 recommendations.

head(count_feature(games_df.filtered[games_df.filtered$recommendations >= 50000,],"developers"),15)
developers Frequency
Valve 8
Feral Interactive (Mac) 6
Ubisoft Montreal 6
Feral Interactive (Linux) 5
Ltd. 5
Aspyr (Mac) 4
Bethesda Game Studios 4
CAPCOM Co. 4
CD PROJEKT RED 4
Paradox Development Studio 4
Aspyr (Linux) 3
DICE 3
Firaxis Games 3
FromSoftware 3
Inc. 3
head(count_feature(games_df.filtered[games_df.filtered$recommendations >= 50000,],"publishers"),15)
publishers Frequency
Ubisoft 10
Bethesda Softworks 9
Electronic Arts 9
Valve 9
Xbox Game Studios 7
2K 6
Feral Interactive (Mac) 6
Paradox Interactive 6
Feral Interactive (Linux) 5
Square Enix 5
Aspyr (Mac) 4
CAPCOM Co. 4
CD PROJEKT RED 4
Ltd. 4
Aspyr (Linux) 3

industryLeader indicator

Among the lists of developers and publishers of games with over 50,000 recommendations, some big players in the gaming industry have showed up.

I have created a list of current industry leading game developer and publishers based on this list with some research and domain knowledge. This list consists of companies that are widely recognized and have more resources in both developing and publishing a game to maximize sales. A indicator variable will be created for whether the developers or publishers column contains a name from this list.

This helps the model address games that may receive far more recommendations simply because they were developed or published by industry leaders.

# Gaming leaders list: widely recognized, well-resourced developers and
# publishers.
# NOTE(review): the original list contained "KOEI TECMO GAMES CO" and "LTD."
# as two separate entries -- an accidental comma split of one company name.
# Because the pattern was not regex-escaped, the bare "LTD." entry (with `.`
# matching any character) flagged every "...Ltd." company as a leader.
game_leaders <- c("Valve", "Bethesda Softworks","CD PROJEKT RED", "Xbox Game Studios", "2K", "CAPCOM Co.", "Bohemia Interactive","Facepunch Studios","THQ Nordic","Square Enix","Rockstar Games","Rockstar North","Rockstar Toronto", "Ubisoft", "Electronic Arts","Bethesda Game Studios", "Ubisoft Montreal","Game Science","DICE","Amazon Games","Eidos-Montreal","Snail Games USA","Klei Entertainment","Hello Games", "Gearbox Software","343 Industries","Techland","Bungie","FromSoftware","FromSoftware Inc.","BANDAI NAMCO Entertainment","SEGA","Capcom","KOEI TECMO GAMES CO., LTD.","Konami Digital Entertainment","Warner Bros. Games", "Warner Bros. Interactive Entertainment","WB Games","Deep Silver","Activision","PlayStation PC LLC","Lucasfilm", "LucasArts", "Disney","Pearl Abyss","Tripwire Interactive")

# Escape regex metacharacters so each name matches literally, then combine
# all names into a single alternation pattern.
game_leaders_regex <- paste(
  gsub("([][{}()|^$.*+?\\\\])", "\\\\\\1", game_leaders),
  collapse = "|"
)

# Flag a game as industry-leader-backed when either its publishers or its
# developers string matches any name on the leaders list (case-insensitive
# substring match via the alternation regex built above).
games_df.filtered$industryLeader <- grepl(game_leaders_regex, games_df.filtered$publishers, ignore.case = TRUE) |
                                grepl(game_leaders_regex, games_df.filtered$developers, ignore.case = TRUE)
table(games_df.filtered$industryLeader)
## 
## FALSE  TRUE 
##  4723  1156

Numeric features

recommendations

This feature contains the number of recommendations given out by players.

summary(games_df.filtered$recommendations)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     101     513    1396    8150    4313  899477
hist(games_df.filtered$recommendations)

Log-transformed recommendations:

# log1p transform of the recommendation count (heavy right skew).
games_df.filtered <- games_df.filtered %>% 
  mutate(logRecommendations = log(recommendations + 1))

hist(games_df.filtered$logRecommendations)

Games with recommendation count exceeding 100,000 (~exp(11.50)) are determined as extreme outliers within this dataset and are removed due to the limited data of such games, their inclusion could significantly impact model performance and potentially hinder accurate predictions for the majority of games with more typical recommendation counts.

games_df.filtered<- games_df.filtered %>% filter(recommendations < 100000)

hist(games_df.filtered$logRecommendations, main = "Histogram of log Recommendations", xlab = "log Recommendations", breaks = 30)

hist(games_df.filtered$recommendations, main = "Histogram of Recommendations", xlab = "Recommendations", breaks = 40)

boxplot(logRecommendations ~ aboveTwoHours, data = games_df.filtered)

boxplot(logRecommendations ~ industryLeader, games_df.filtered)

Boxplot shows games with median playtime above two hours or games from industry leaders recieve higher amount of recommendations on average.

boxplot(logRecommendations ~ languagesCount, games_df.filtered)

Games with more supported languages also were shown to recieve more recommendations.

price:

hist(games_df.filtered$price, main = "Price distribution")

summary(games_df.filtered$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.35    4.99   11.41   15.07   19.99   69.99
boxplot(price ~ industryLeader, data = games_df.filtered)

boxplot(price ~ aboveTwoHours, data = games_df.filtered)

Games from industry leaders or with over two hours median playtime are on average higher priced.

priceRange

0-10 = ‘Low’ 11-20 = ‘Medium’ 21 or above = ‘High’

# Bucket price into three ordered bins. With right = TRUE the intervals are
# (0, 10] -> Low, (10, 20] -> Medium, (20, Inf) -> High.
price_breaks <- c(0, 10, 20, Inf)
price_labels <- c("Low", "Medium", "High")

games_df.filtered$priceRange <- cut(
  games_df.filtered$price,
  breaks = price_breaks,
  labels = price_labels,
  right  = TRUE
)

table(games_df.filtered$priceRange)
## 
##    Low Medium   High 
##   2867   1861   1078
boxplot(logRecommendations ~ priceRange, data=games_df.filtered)

Boxplot shows the price range of games have some positive relationships with log-recommendations.

releaseDate

This feature is the date that the game has been published/released on the Steam platform specifically, and does not reflect the earliest release date of a game on any other platform.

I will perform some feature engineering on the releaseDate variable to obtain years since released, days since released, released month, and released year:

# Derive calendar features from releaseDate: release month and year, plus
# age of the game. Reference points are hard-coded (year 2024 and
# 2024-10-01) so the notebook output is reproducible.
games_df.filtered <- games_df.filtered %>%
  mutate(
    releaseMonth  = month(releaseDate),
    releaseYear   = year(releaseDate),
    yearsReleased = 2024 - releaseYear,                                # years since release
    daysReleased  = as.numeric(as.Date("2024-10-01") - releaseDate)    # days since release
  )

 
# boxplots
boxplot(logRecommendations ~ yearsReleased,
        data = games_df.filtered)

boxplot(price ~ yearsReleased,
        data = games_df.filtered)

It is shown that price of games decreases gradually as released years gets higher until the price reaches around 10.

plot(games_df.filtered$daysReleased, games_df.filtered$logRecommendations, pch = 20,
     cex = 0.8, xlab = "Days Released", ylab = "Log Recommendations")

The recommendations count also seems to decrease slightly as days released gets higher.

More boxplots on price:

par(mfrow = c(1,2))
boxplot(price ~ yearsReleased,
        data = games_df.filtered[games_df.filtered$selfPublished == TRUE,],
        main = "Self-Published")
boxplot(price ~ yearsReleased,
        data = games_df.filtered[games_df.filtered$selfPublished == FALSE,],
        main = "Non-Self-Published")

Self-Published games are shown to have lower price on average, compared to other games throughout the years. Some interaction between selfPublished and yearsReleased is observed when looking at their effects on price.

hist(games_df.filtered$releaseYear, main = "#Games Released by year", breaks = 30)

The surge in games listed on Steam from 2013 to 2016 could be due to changes in Valve (Steam) policies that allowed developers to submit their games for community voting and approvals. After that, a rapid drop from 2016 onwards could be due to some of the game records missed during data collection.

I will remove games released on Steam from 1995 - 2008 since these older games that could introduce patterns that no longer hold true and would not be relevant for the task of modelling based on more recent dynamics of the market.

games_df.filtered <- games_df.filtered %>% filter(releaseDate > '2008-12-31')
dim(games_df.filtered)
## [1] 5596   40

Some other numeric features:

positive/negative

Number of positive or negative reviews.

I created a new variable which reflects the proportion of positive reviews as positivePct.

# Share of positive reviews. The earlier filter guarantees
# positive + negative > 0 here; coalesce() additionally maps any missing
# result from a zero denominator to 0 as a safety net.
games_df.filtered <- games_df.filtered %>%
  mutate(positivePct = coalesce(positive/(positive + negative), 0))

hist(games_df.filtered$positivePct, main = "Positive Pct")

plot(x = games_df.filtered$positivePct, y = games_df.filtered$logRecommendations, pch = 20,
     cex = 0.8, xlab = "Positive pct.", ylab = "Log Recommendations")

Most games have a high positive rating percentage, with the most amount of games with 80-90% positive reviews.

achievements

This feature contains number of achievements/milestones available in a game. The distribution of achievement amount is right-skewed with the long tail on higher amount of achievements. I will log transform this feature as a new feature for easier interpretation.

print(summary(games_df.filtered$achievements))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   10.00   25.00   50.71   46.00 5000.00
hist(games_df.filtered$achievements, breaks = 30)

# log1p transform to compress the heavy right tail of achievement counts.
games_df.filtered <- games_df.filtered %>% mutate(logAchievements = log(1 + achievements))
hist(games_df.filtered$logAchievements, breaks = 30)

It seems that most games have zero achievements (log(1)), but many also have extremely large number of achievements

boxplot(logAchievements ~ aboveTwoHours, data = games_df.filtered)

boxplot(logAchievements ~ priceRange, data = games_df.filtered)

It is shown that games with median playtime over two hours have at least some achievements in the game, and games with higher price have more achievements on average.

dlcCount

This feature contains number of downloadable contents “DLC” available in a game.

hist(games_df.filtered$dlcCount, breaks = 100)

I will cap the dlcCount to 30, which is a suitable cap for most games, games with over 30 DLCs usually have a very specific design which is to update and expand gaming contents constantly, there are only 56 games with over 30 DLCs.

head(games_df.filtered[games_df.filtered$dlcCount > 30,][1:6])
appID name releaseDate estimatedOwners required_age price
14 730310 DYNASTY WARRIORS 9 2018-02-13 1000000 - 2000000 0 44.99
76 270880 American Truck Simulator 2016-02-02 2000000 - 5000000 0 19.99
315 24010 Train Simulator Classic 2022-04-21 1000000 - 2000000 0 24.99
405 744060 Groove Coaster 2018-07-16 100000 - 200000 0 19.99
480 678950 DRAGON BALL FighterZ 2018-01-26 2000000 - 5000000 0 59.99
512 45760 Ultra Street FighterR IV 2014-08-07 500000 - 1000000 0 29.99
# Cap extreme DLC counts at 30 (only 56 games exceed this; they follow a
# constant-expansion content model that is out of scope here).
games_df.filtered <- games_df.filtered %>%
  mutate(dlcCount = pmin(dlcCount, 30))
hist(games_df.filtered$dlcCount, breaks = 100, main = "Histogram of DLC Count")

As the distribution is heavily skewed, I will log-transform the dlcCount feature as a new feature for interpretation.

# log1p transform of the (already capped) DLC count.
games_df.filtered <- games_df.filtered %>% mutate(logDlcCount = log(1 + dlcCount))
hist(games_df.filtered$logDlcCount, main = "Histogram of log DLC count", xlab="log DLC count", breaks = 30)

The log transformed values for dlcCount have a exponential distribution.

boxplot(logRecommendations ~ dlcCount, data = games_df.filtered)

It is shown that games with more dlc count leads to higher recommendations, there is more fluctuations in recommendations count for games with over 13 DLCs as there is not much data for this type of games, but their recommendation count remains on the relatively high range on average regardless.

Final look at cleaned data

skim(games_df.filtered)
Data summary
Name games_df.filtered
Number of rows 5596
Number of columns 43
_______________________
Column type frequency:
character 7
Date 1
factor 2
logical 3
numeric 30
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
appID 0 1 3 7 0 5596 0
name 0 1 1 81 0 5576 0
languages 0 1 7 330 0 2525 0
fullAudioLanguages 0 1 0 319 2352 659 0
developers 0 1 2 142 0 3612 0
publishers 0 1 0 80 28 2446 0
genres 0 1 0 93 8 402 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
releaseDate 0 1 2009-01-07 2024-05-16 2017-02-02 2496

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
estimatedOwners 0 1 FALSE 9 200: 1329, 100: 1223, 500: 1108, 200: 742
priceRange 0 1 FALSE 3 Low: 2699, Med: 1823, Hig: 1074

Variable type: logical

skim_variable n_missing complete_rate mean count
age16 0 1 0.08 FAL: 5149, TRU: 447
selfPublished 0 1 0.44 FAL: 3121, TRU: 2475
industryLeader 0 1 0.18 FAL: 4604, TRU: 992

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
required_age 0 1 1.62 4.93 0.00 0.00 0.00 0.00 18.00 ▇▁▁▁▁
price 0 1 15.27 12.68 0.35 4.99 11.99 19.99 69.99 ▇▅▂▁▁
dlcCount 0 1 1.75 4.30 0.00 0.00 0.00 1.00 30.00 ▇▁▁▁▁
positive 0 1 5769.60 13183.27 44.00 545.00 1390.00 4299.50 119516.00 ▇▁▁▁▁
negative 0 1 923.26 2308.34 3.00 109.00 259.00 704.00 41287.00 ▇▁▁▁▁
achievements 0 1 50.71 243.69 0.00 10.00 25.00 46.00 5000.00 ▇▁▁▁▁
recommendations 0 1 5278.38 11765.46 101.00 497.75 1355.50 4038.00 99084.00 ▇▁▁▁▁
averagePlaytime 0 1 674.97 1481.90 2.00 165.00 291.00 651.00 51388.00 ▇▁▁▁▁
medianPlaytime 0 1 587.67 1982.25 3.00 168.00 278.00 553.25 102435.00 ▇▁▁▁▁
developerCount 0 1 1.17 0.51 1.00 1.00 1.00 1.00 10.00 ▇▁▁▁▁
publisherCount 0 1 1.13 0.41 0.00 1.00 1.00 1.00 5.00 ▇▁▁▁▁
genresCount 0 1 2.66 1.28 0.00 2.00 2.50 3.00 9.00 ▂▇▂▁▁
languagesCount 0 1 6.57 5.51 1.00 1.00 6.00 10.00 29.00 ▇▅▁▁▁
fullAudioLanguagesCount 0 1 1.96 3.63 0.00 0.00 1.00 2.00 29.00 ▇▁▁▁▁
estimatedSkew 0 1 -0.05 0.43 -1.00 -0.33 -0.09 0.25 0.99 ▂▆▇▅▂
playtimeRatio 0 1 1.46 3.36 0.50 0.75 0.92 1.34 146.20 ▇▁▁▁▁
averagePlayHours 0 1 11.25 24.70 0.03 2.75 4.85 10.85 856.47 ▇▁▁▁▁
medianPlayHours 0 1 9.79 33.04 0.05 2.80 4.63 9.22 1707.25 ▇▁▁▁▁
averagePlayDays 0 1 0.47 1.03 0.00 0.11 0.20 0.45 35.69 ▇▁▁▁▁
medianPlayDays 0 1 0.41 1.38 0.00 0.12 0.19 0.38 71.14 ▇▁▁▁▁
logMedianPlaytime 0 1 5.71 1.06 1.39 5.13 5.63 6.32 11.54 ▁▆▇▁▁
aboveTwoHours 0 1 0.85 0.36 0.00 1.00 1.00 1.00 1.00 ▂▁▁▁▇
logRecommendations 0 1 7.33 1.51 4.62 6.21 7.21 8.30 11.50 ▅▇▆▃▁
releaseMonth 0 1 6.65 3.34 1.00 4.00 7.00 10.00 12.00 ▇▆▅▆▇
releaseYear 0 1 2016.65 3.33 2009.00 2014.00 2017.00 2019.00 2024.00 ▃▇▇▆▂
yearsReleased 0 1 7.35 3.33 0.00 5.00 7.00 10.00 15.00 ▃▆▇▅▂
daysReleased 0 1 2771.54 1213.54 138.00 1813.00 2798.00 3609.00 5746.00 ▃▆▇▅▂
positivePct 0 1 0.81 0.13 0.15 0.75 0.84 0.91 0.99 ▁▁▁▅▇
logAchievements 0 1 2.79 1.60 0.00 2.40 3.26 3.85 8.52 ▅▆▇▁▁
logDlcCount 0 1 0.57 0.77 0.00 0.00 0.00 0.69 3.43 ▇▅▂▁▁

Correlation Analysis in games data

# Columns used in the correlation analysis: both targets plus candidate predictors
corr_cols <- c(
  "logRecommendations", "recommendations", "price", "estimatedSkew",
  "positivePct", "dlcCount", "medianPlaytime", "logMedianPlaytime",
  "daysReleased", "languagesCount", "aboveTwoHours", "selfPublished",
  "age16", "industryLeader"
)
numeric_df <- games_df.filtered[, corr_cols]

# Pairwise correlation matrix (logical columns are coerced to 0/1 by cor())
corr_matrix <- cor(numeric_df)

# Upper-triangle heatmap with coefficient values overlaid
corrplot(corr_matrix,
         method = "color", type = "upper",
         addCoef.col = "black",            # coefficient labels in black
         tl.col = "black", tl.srt = 45,    # axis label colour / rotation
         tl.cex = 0.8,                     # axis label size
         number.cex = 0.55)                # coefficient label size

Selected Predictors

Predictors Selection for recommendations as the regression target variable:

It can be seen that estimatedSkew, price, positivePct, languagesCount, dlcCount, age16 and industryLeader each have a correlation above 0.2 with the log target variable. In addition, the log-transformed medianPlaytime has a higher correlation with the log target variable than its original value, also above 0.2.

chosen features: estimatedSkew, price, positivePct, languagesCount, age16, industryLeader, dlcCount, logMedianPlaytime,

Predictors Selection for priceRange as the classification target variable:

dlcCount, daysReleased, languagesCount, age16, industryLeader, selfPublished and aboveTwoHours have relatively strong correlation with the target, also the log transformed medianPlaytime and recommendations have a much higher correlation than its original value. Variables with weaker correlations are included for this problem because the models chosen are more flexible and robust to linearity assumptions.

chosen features: dlcCount, daysReleased, languagesCount, age16, industryLeader, selfPublished, aboveTwoHours, logMedianPlaytime, logRecommendations

Scatter plot for regression features

# Pairwise scatter plots, densities and correlations for the chosen
# regression features
regr_pair_cols <- c(
  "logRecommendations", "estimatedSkew", "price", "positivePct",
  "languagesCount", "logMedianPlaytime", "dlcCount", "age16", "industryLeader"
)

ggpairs(
  games_df.filtered[, regr_pair_cols],
  lower = list(continuous = wrap("points", size = 0.5)),
  progress = FALSE
) +
  theme(
    # Horizontal y-strip labels are easier to read
    strip.text.y = element_text(angle = 0, size = 8.5),
    strip.text.x = element_text(size = 8.5)
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tags_df data cleaning

The tags feature contains a list of user-generated tags voted for a game, and an inner list containing the frequency of each tag. The extracted dataframe tags_df is in a long format, where each row represents a tag and its frequency for a game. The games in this dataframe are filtered so that it only includes games from the games_df.filtered dataframe.

# Keep only tags belonging to games that survived the games_df.filtered
# cleaning, then sanitise tag names into valid R identifiers.
tags_df.filtered <- tags_df %>%
  filter(appID %in% games_df.filtered$appID) %>%
  mutate(tag = make.names(tag))

head(tags_df.filtered, 10)
appID tag frequency
1026420 Tactical.RPG 255
1026420 Turn.Based.Strategy 248
1026420 Wargame 248
1026420 Historical 239
1026420 Strategy.RPG 236
1026420 Perma.Death 231
1026420 RPG 225
1026420 Difficult 221
1026420 Turn.Based.Combat 219
1026420 X2D 215

I will extract only the top 20 tags of each game, based on the frequency of the voted tag for the game, this is because Steam also uses only the top 20 tags to help in querying games for users, and having more than 20 tags to describe a game may introduce more irrelevant/rare tags for a game. https://partner.steamgames.com/doc/store/tags#

# top 20 tags for each appID
# Keep only each game's 20 most-voted tags (Steam itself uses the top 20
# tags when querying games).
# NOTE: arrange() ignores groups by default, so rows are sorted globally by
# frequency; row_number() then runs within each appID group, which still
# produces a per-game frequency rank.
top_tags <- tags_df.filtered %>%
  group_by(appID) %>%
  mutate(frequency = as.numeric(frequency)) %>%
  arrange(desc(frequency)) %>%
  mutate(rank = row_number()) %>%
  filter(rank <= 20) %>%
  #mutate(tagPct = as.numeric(frequency / sum(frequency))) %>%# add proportion column
  ungroup()

head(top_tags)
appID tag frequency rank
219990 Action.RPG 34590 1
392110 Strategy 30531 1
370910 Point…Click 13760 1
231430 Strategy 8346 1
252130 Puzzle 7706 1
285800 Strategy 7582 1
length(unique(top_tags$tag))
## [1] 433

Top and bottom 20 used tags

# Total number of votes per tag across all games, most-voted first
tags_freq <- top_tags %>%
  group_by(tag) %>%
  summarise(frequency = sum(frequency)) %>%
  arrange(desc(frequency))

# 20 most-voted tags (bars ordered by frequency)
ggplot(head(tags_freq, 20),
       aes(x = reorder(tag, frequency), y = frequency)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 20 tags voted by players", x = "tags", y = "Frequency")

# 20 least-voted tags
ggplot(tail(tags_freq, 20),
       aes(x = reorder(tag, frequency), y = frequency)) +
  geom_col() +
  coord_flip() +
  labs(title = "Bottom 20 tags voted by players", x = "tags", y = "Frequency")

These tags were voted less than 300 times (all games combined).

Unsupervised Learning

For the Unsupervised learning part of this coursework, I will be using the user-generated tags frequency for each game to find clusters of tags that are similar.

Euclidean distance based Hierarchical Clustering

Hierarchical clustering can be used to find common groups within different tags. The euclidean distance between tags based on their usage and frequency across different games can be used to find tags that are “close” to each other.

# Reshape to a tag-by-game table: one row per tag, one column per appID,
# cells hold the vote frequency (NA where a tag was never voted for a game)
tags_games <- top_tags %>%
  pivot_wider(id_cols = tag,          # row identifier
              names_from = appID,     # one column per game
              values_from = frequency)

tags_games[is.na(tags_games)] <- 0                        # absent tag == zero votes
colnames(tags_games) <- make.names(colnames(tags_games))  # syntactic column names
head(tags_games[1:6])
tag X219990 X392110 X370910 X231430 X252130
Action.RPG 34590 0 0 0 0
Strategy 0 30531 0 8346 58
Point…Click 0 0 13760 0 0
Puzzle 0 0 54 0 7706
Hack.and.Slash 7225 0 0 0 0
Dark.Fantasy 7212 0 0 0 0

Format as matrix form and get distance matrix

# Separate the tag labels from the numeric frequency matrix
tags_games.labs <- tags_games$tag
tags_games.data <- tags_games[, -1]          # drop the tag column

# Standardise each game column, then compute pairwise Euclidean distances
# between tags
tags_games.scaled <- scale(tags_games.data)
tags_games.dist <- dist(tags_games.scaled)

Hierarchical clustering using distance matrix

# Fit each linkage once and plot the resulting dendrograms
hc_complete <- hclust(tags_games.dist)                       # complete linkage (default)
plot(hc_complete,
     labels = tags_games.labs,
     main = "Euclidean-distance Clustering",
     xlab = "", ylab = "",
     cex = 0.2)

hc_average <- hclust(tags_games.dist, method = "average")    # average linkage
plot(hc_average,
     labels = tags_games.labs,
     main = "Average Linkage",
     xlab = "", ylab = "",
     cex = 0.3)

Look at cluster assignments when K = 10.

# Cut the complete-linkage tree into 10 clusters and tabulate cluster sizes.
# Reuse the distance matrix computed earlier instead of recomputing
# dist(tags_games.scaled) (identical result, avoids the redundant work).
tags_games.dist_hclust <- hclust(tags_games.dist)
dist_hclust.clusters <- cutree(tags_games.dist_hclust, 10)

data.frame(table(dist_hclust.clusters))
dist_hclust.clusters Freq
1 421
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 4
10 1

Look at some tags in cluster 1 and 9

# Lookup table mapping each tag to its Euclidean-distance cluster
dist_clusters <- data.frame(tag = tags_games.labs, cluster = dist_hclust.clusters)

# Index the tag column directly: the original df[df['cluster'] == 1] form
# indexes the data frame with a logical matrix, which flattens it and leaks
# cluster numbers into the printed tags (the stray " 9" entries below).
head(dist_clusters$tag[dist_clusters$cluster == 1], 10)
##  [1] "Action.RPG"        "Point...Click"     "Hack.and.Slash"   
##  [4] "Dark.Fantasy"      "Metroidvania"      "Early.Access"     
##  [7] "FPS"               "Real.Time.Tactics" "Space.Sim"        
## [10] "X4X"
head(dist_clusters$tag[dist_clusters$cluster == 9], 10)
## [1] "Multiplayer"  "Singleplayer" "Co.op"        "Online.Co.Op"

When the tags are separated into 10 clusters, it can be seen that most of the tags are grouped in cluster 1, while cluster 9, for example, contains popular tags like “Multiplayer”, “Online Co-op” and “Co-op”. This could be because the Euclidean distance takes into account the raw frequency of the tags voted for each game: since each game only has 10-20 tags, the distribution of tag frequency for each game is zero-inflated. This zero-inflation leads to many tags having low or no frequencies across games, which is most likely why they have been grouped together. Furthermore, since user-generated tag frequencies are heavily influenced by game popularity, a popular game will naturally have higher tag frequencies; this introduces tag outliers, where a tag tied to a very popular game is grouped by itself because it has a longer Euclidean distance to every other tag.

I will then try a correlation-based distance to measure dissimilarity between tags, which focuses on relationships between tags rather than their raw frequency/magnitude.

Correlation-based distance Hierarchical Clustering

# corr matrix
# Pairwise correlations between tags across games (transposed so that cor()
# treats each tag as an observation)
tags_games.corr <- cor(t(tags_games.scaled))
# 1 - correlation to convert to correlation-based dissimilarity matrix
# (highly correlated tags get a distance near 0, anti-correlated near 2)
tags_games.corr_dist <- as.dist(1 - tags_games.corr)

# Fit each linkage once instead of re-running hclust() for every plot
# (the original recomputed the complete-linkage tree four times, including
# inside rect.hclust)
corr_hc_complete <- hclust(tags_games.corr_dist)
corr_hc_average  <- hclust(tags_games.corr_dist, method = "average")

plot(corr_hc_complete,
     labels = tags_games.labs,
     main = "Complete Linkage",
     xlab = "", ylab = "",
     cex = 0.3)

plot(corr_hc_average,
     labels = tags_games.labs,
     main = "Average Linkage",
     xlab = "", ylab = "",
     cex = 0.3)

plot(corr_hc_complete,
     labels = tags_games.labs,
     main = "Correlation-based distance Clustering",
     xlab = "", ylab = "",
     cex = 0.2)
# Draw rectangles around the 7 clusters on the tree just plotted
rect.hclust(corr_hc_complete, k = 7)

Look at cluster assignments frequency when K = 7.

# Complete-linkage tree on the correlation-based distances
corr_hc <- hclust(tags_games.corr_dist)

# Assign every tag to one of 7 clusters; keep a lookup table ordered by cluster
corr_hc.clusters <- cutree(corr_hc, k = 7)
tag_cluster <- data.frame(tag = tags_games.labs,
                          cluster = corr_hc.clusters) %>%
  arrange(cluster)

# Cluster sizes
data.frame(table(corr_hc.clusters))
corr_hc.clusters Freq
1 86
2 39
3 59
4 173
5 19
6 30
7 27

Look at tag assignments from correlation matrix of raw tag frequencies

# print out first 20 tags in each cluster
# (the original comment said 10 but head(..., 20) was used; kept at 20).
# Index the tag column directly: df[df['cluster'] == i] indexes the data
# frame with a logical matrix, flattening it and mixing cluster numbers into
# the printed tags (the stray "5" in the original cluster-5 output).
for (i in seq_along(unique(tag_cluster$cluster))) {
  cat('\ncluster', i, ': [',
      head(tag_cluster$tag[tag_cluster$cluster == i], 20),
      ',...]')
}
## 
## cluster 1 : [ Action.RPG Hack.and.Slash Dark.Fantasy X4X PvE Loot Medieval Hacking Bullet.Hell Dungeon.Crawler Rogue.lite Souls.like Rogue.like Immersive.Sim Replay.Value Action.Roguelike X3D Dog Deckbuilding Card.Battler ,...]
## cluster 2 : [ Strategy Real.Time.Tactics RTS Grand.Strategy Wargame Real.Time Racing City.Builder Simulation Colony.Sim Life.Sim Sandbox Physics Realistic Automobile.Sim VR Building Historical Driving Transportation ,...]
## cluster 3 : [ Point...Click Adventure Metroidvania Horror Great.Soundtrack Story.Rich Walking.Simulator Pixel.Graphics Choices.Matter Difficult Singleplayer X2D Multiple.Endings Drama Cinematic Roguevania Platformer Dark Relaxing Exploration ,...]
## cluster 4 : [ Puzzle MMORPG Indie Casual Artificial.Intelligence Sailing Sports World.War.I Tanks Surreal Psychedelic Jet Steampunk Parkour Logic Short Beautiful Puzzle.Platformer Comic.Book World.War.II ,...]
## cluster 5 : [ Early.Access FPS Space.Sim PvP Open.World Survival Open.World.Survival.Craft Multiplayer Space First.Person Team.Based Co.op Shooter Sci.fi Science Online.Co.Op Crafting Voxel Mars ,...]
## cluster 6 : [ RPG Sexual.Content Nudity Hentai Mature Turn.Based.Tactics Magic LGBTQ. Romance Female.Protagonist Fantasy Cute Turn.Based.Combat JRPG Character.Customization Turn.Based.Strategy Anime Turn.Based NSFW Strategy.RPG ,...]
## cluster 7 : [ Comedy Action Funny Zombies Stealth Gore Arena.Shooter Combat Assassin Violent Post.apocalyptic Dark.Humor Action.Adventure Third.Person Looter.Shooter Third.Person.Shooter Blood Fast.Paced Inventory.Management Satire ,...]

Based on the dendrogram of clustering using complete linkage method, it could be seen that there are 7 major clusters based on a correlation-based distance between tags. It is also shown that each of the clusters have tags that are more similar to each other compared to euclidean-based distance. For example, strategic shooting games-related tags are all in the same cluster, and singleplayer or roleplaying games-related tags are in another cluster, and so on.

The results show that using correlation-based distances can help mitigate some issues associated with zero-inflation, as they focus on the relative patterns of usage rather than absolute counts.

There is one cluster that contains a lot more tags than any other clusters, this cluster is formed mostly by merging a lot of other smaller clusters during the clustering process, and have formed a single “branch” of its own, with many inner clusters merged sequentially, this could indicate that the tags within this cluster are heterogeneous, and that it contains a diverse mix of tags with weak or no meaningful relationships between them. Within this cluster, some commonly used tags like “MMORPG” and “Casual” are present, while the remaining tags within the cluster seems to be too specific or unrelated to each other for the algorithm to group them meaningfully, which could be the reason why they are all grouped together in this one cluster.

PCA

Based on previous results in hierarchical clustering, it has been shown that correlation-based dissimilarity distances are useful for grouping tags into clusters. To further improve the quality of clusters, I will try reducing dimensionality of the tags data before performing clustering to minimize noise and redundant information. In this context, Principal Component Analysis (PCA) can be utilized to identify linear combinations of different tags that maximize the variance in the data.

PCA examines the relationships between tags based on their frequency across all games. Tags that frequently appear together tend to have strong loadings on the same principal components, indicating that they contribute similarly to the overall variance. Moreover, hierarchical clustering can then be applied to the principal component scores, using the distances between these projected loading scores to derive more refined clusters.

# Game-by-tag table: one row per appID, one column per tag, cells hold the
# vote frequency (NA where a game lacks the tag)
tags.wide <- top_tags %>%
  pivot_wider(id_cols = appID,        # row identifier
              names_from = tag,       # one column per tag
              values_from = frequency)

tags.wide[is.na(tags.wide)] <- 0                        # absent tag == zero votes
colnames(tags.wide) <- make.names(colnames(tags.wide))  # syntactic column names
head(tags.wide[1:6])
appID Action.RPG Strategy Point…Click Puzzle Hack.and.Slash
219990 34590 0 0 0 7225
392110 0 30531 0 0 0
370910 0 0 13760 54 0
231430 0 8346 0 0 0
252130 0 58 0 7706 0
285800 0 7582 0 0 0
dim(tags.wide)
## [1] 5596  434

Perform PCA

# scale first
# Standardise each tag column, then transpose so the tags become the
# observations passed to PCA (prcomp centres again across the transposed
# matrix by default).
tags_scaled <- scale(tags.wide[,-1])

pca_result <- prcomp(t(tags_scaled))

Choose number of components

# Cumulative proportion of variance explained, and the smallest number of
# components reaching 90% cumulative PVE
explained_variance <- summary(pca_result)$importance[2, ] # Proportion of variance explained
cumulative_variance <- cumsum(explained_variance) # Cumulative variance explained
num_components <- which(cumulative_variance >= 0.9)[1] # Retain components to explain 90% of variance
plot(cumulative_variance,
     xlab = 'Principal Components', 
     ylab = "Cumulative PVE",
     main = "PCA Screeplot")

# vertical line at num_components
abline(v = num_components, col = "red", lwd = 2, lty = 2) 

# text next to the vertical line
text(num_components + 0.5, 0.85, 
     labels = paste("90% PVE at PC", num_components), 
     col = "red", pos = 4)

Project tags data onto PCA space

# project the tag frequencies onto the PC space
# (rows of pca_result$x are the tags, since PCA ran on the transposed matrix)
pca_scores <- as.data.frame(pca_result$x)

# Subset the PC up to 90% PVE
pca_scores_subset <- pca_scores[, 1:num_components]

head(pca_scores_subset[1:6])
PC1 PC2 PC3 PC4 PC5 PC6
Action.RPG -10.802061 8.575841 -16.570341 1.199756 -18.2563128 18.818229
Strategy 18.571310 -1.686384 -5.990351 -5.520766 -9.2062502 -4.053358
Point…Click -3.956126 1.630471 5.245800 -3.629615 0.9030723 6.578221
Puzzle -12.648316 -11.923013 11.670569 -8.818684 0.0965081 -9.752322
Hack.and.Slash -15.188342 12.078385 -21.529573 2.414603 -21.9329717 10.655177
Dark.Fantasy -12.971287 10.946296 -19.037150 1.014465 -25.2052921 16.420057

Calculate Correlation-based distance matrix of tags based on PCs

# Correlation between every pair of tags in the reduced PC space (transpose
# so cor() treats each tag's PC scores as one observation)
pca_cor_matrix <- cor(t(pca_scores_subset))

# Convert correlation to dissimilarity (1 - r) and cluster with complete linkage
pca_cor_distance <- as.dist(1 - pca_cor_matrix)
pca_hclust <- hclust(pca_cor_distance, method = "complete")

Examining cluster assignments

Plotting dendrogram with colored branches and group number, used modified code from: https://stackoverflow.com/questions/48636522/label-r-dendrogram-branches-with-correct-group-number

# Cluster assignment used for colouring the dendrogram branches
pca_clusters <- cutree(pca_hclust, k = 7)

# Create dendrogram
dend <- as.dendrogram(pca_hclust)

colors <- RColorBrewer::brewer.pal(7, "Dark2")

# below code is from: https://stackoverflow.com/questions/57674446/how-to-associate-cluster-labels-and-dendrogram-in-the-same-order-on-a-plot?noredirect=1&lq=1 
# dendextend::cutree is exported, so the public :: accessor is used instead
# of ::: ; order_clusters_as_data = FALSE numbers clusters in dendrogram
# (left-to-right) order rather than data order.
dend.cutree <- dendextend::cutree(pca_hclust, k=7, order_clusters_as_data = FALSE)

# Align the two numbering schemes so branch labels match the cluster numbers
# used elsewhere in the analysis
idx <- order(names(dend.cutree))
dend.cutree <- dend.cutree[idx]
df.merge <- merge(pca_clusters, dend.cutree, by = 'row.names')
df.merge.sorted <- df.merge[order(df.merge$y), ]
lbls <- unique(df.merge.sorted$x)
dend1 <- color_branches(pca_hclust, k = 7, groupLabels = lbls, col = colors[lbls])
plot(dend1, leaflab = 'none', main = "Tag Clustering w/ PC score correlation-distance")

Around 7 major clusters could be determined after cutting the dendrogram.

# Cut the PCA-based tree into 7 clusters (same call as before the dendrogram
# plot; repeated so this chunk is self-contained)
pca_clusters <- cutree(pca_hclust, k = 7)

# Cluster sizes
print(data.frame(table(pca_clusters)))
##   pca_clusters Freq
## 1            1   17
## 2            2   23
## 3            3  231
## 4            4   19
## 5            5   80
## 6            6   34
## 7            7   29

PC variance and PCs plot with cluster assignments

# Proportion of variance explained by each PC (rounded for axis labels)
pca.var <- pca_result$sdev^2
pca.var.per <- round(pca.var / sum(pca.var), 3)

# First three PC scores per tag, plus the cluster assignment as a factor
pca.data <- data.frame(tag = rownames(pca_result$x),
                       X = pca_result$x[, 1],
                       Y = pca_result$x[, 2],
                       Z = pca_result$x[, 3])
pca.data$cluster <- factor(pca_clusters)

# Scatter of PC1 vs PC2, tags drawn as coloured text labels
ggplot(data = pca.data,
       aes(x = X, y = Y, label = tag, color = cluster)) +
  geom_text(size = 3) +
  xlab(paste0("PC1, PVE: ", pca.var.per[1])) +
  ylab(paste0("PC2, PVE: ", pca.var.per[2])) +
  ggtitle("PCA plot of first two principal components") +
  scale_color_brewer(palette = "Dark2")

3D plot for first 3 PCs

# Open a new 3D plotting window and create an empty plot (type = "n")
# (points are suppressed so labelled text can be drawn in their place)
plot3d(pca.data$X, pca.data$Y, pca.data$Z,
       type = "n",  # Do not plot points immediately
       xlab = paste("PC1, PVE: ", pca.var.per[1], sep = ""),
       ylab = paste("PC2, PVE: ", pca.var.per[2], sep = ""),
       zlab = paste("PC3, PVE: ", round(pca.var.per[3], 3), sep = ""))

# Add text labels at each point's coordinates.
# Here, 'texts' are taken from the 'tag' column and colored according to the cluster.
text3d(pca.data$X, pca.data$Y, pca.data$Z,
       texts = pca.data$tag,
       col = pca.data$cluster,
       cex = 0.7)  # Adjust 'cex' to control text size

Tag clustering from correlation matrix of pc scores of each tag (first 10 tags of each cluster).

# Lookup table mapping each tag to its PCA-based cluster
tag_pc_cluster <- data.frame(tag = names(pca_clusters), 
                             cluster = pca_clusters)

# print out first 10 tags in each cluster
# Index the tag column directly: df[df['cluster'] == i] indexes the data
# frame with a logical matrix, which flattens it and can mix cluster numbers
# into the printed tags.
for (i in seq_along(unique(tag_pc_cluster$cluster))) {
  cat('\ncluster', i, ': [',
      head(tag_pc_cluster$tag[tag_pc_cluster$cluster == i], 10),
      ',...]')
}
## 
## cluster 1 : [ Action.RPG Hack.and.Slash Dark.Fantasy RPG Loot Magic Dungeon.Crawler Fantasy Dark Character.Customization ,...]
## cluster 2 : [ Strategy Real.Time.Tactics Space.Sim X4X RTS Grand.Strategy Wargame PvP PvE Real.Time ,...]
## cluster 3 : [ Point...Click Puzzle Metroidvania Racing Medieval Great.Soundtrack Bullet.Hell Female.Protagonist Platformer Zombies ,...]
## cluster 4 : [ Adventure FPS Multiplayer Story.Rich Singleplayer First.Person Action Co.op Dog Atmospheric ,...]
## cluster 5 : [ Early.Access Sexual.Content MMORPG Nudity Hentai Mature City.Builder Turn.Based.Tactics Walking.Simulator Hacking ,...]
## cluster 6 : [ Horror Indie Rogue.lite Souls.like Pixel.Graphics Rogue.like Difficult X2D Replay.Value Action.Roguelike ,...]
## cluster 7 : [ Casual Life.Sim LGBTQ. Choices.Matter Comedy Immersive.Sim Multiple.Endings Funny Romance Drama ,...]

Overall, the resulting cluster assignments roughly matched those from clustering the raw tag frequencies directly, but due to concerns with zero-inflation, which might distort the principal components, and the fact that there is not much reduction in data dimensionality (from 433 to 252 features), I have chosen to use the clustering results from the original correlation matrix of raw tag frequencies.

Further Analysis in logRecommendations and price using defined clusters of tags.

I will first assign the tag names to respective cluster number based on previous hierarchical clustering results, and then transform the cluster frequencies into weights based on the proportion of the individual clusters.

# Assign tag names of different games to cluster number and regroup:
# total the tag vote frequency per (game, cluster) pair.
# .groups = "drop_last" is summarise()'s documented default (result stays
# grouped by appID) — stating it explicitly silences the grouping message.
appClusterFrequency <-  top_tags %>%
  left_join(tag_cluster, by = "tag") %>%
  dplyr::select(appID, cluster, frequency) %>%
  group_by(appID, cluster) %>%
  summarise(frequency = sum(frequency), .groups = "drop_last")

head(appClusterFrequency)
appID cluster frequency
1000010 1 1315
1000010 2 407
1000010 3 827
1000010 6 1483
1000010 7 179
1000030 1 521

Calculate weights

# Normalise each game's cluster frequencies into weights that sum to 1
appClusterWeights <- appClusterFrequency %>%
  group_by(appID) %>%
  mutate(weight = frequency / sum(frequency)) %>%
  ungroup()

head(appClusterWeights, 10)
appID cluster frequency weight
1000010 1 1315 0.3122774
1000010 2 407 0.0966516
1000010 3 827 0.1963904
1000010 6 1483 0.3521729
1000010 7 179 0.0425077
1000030 1 521 0.1507087
1000030 2 582 0.1683541
1000030 3 702 0.2030662
1000030 4 666 0.1926526
1000030 5 331 0.0957478
# Pivot Frequency to wide format:
# one row per game, one column per cluster, cells hold the cluster weight
clusterWeight.wide <- appClusterWeights %>%
  pivot_wider(id_cols = appID,      # row identifier
              names_from = cluster, # one column per cluster
              values_from = weight)

clusterWeight.wide[is.na(clusterWeight.wide)] <- 0  # absent cluster == zero weight
colnames(clusterWeight.wide) <- make.names(colnames(clusterWeight.wide)) # syntactic column names
head(clusterWeight.wide)
appID X1 X2 X3 X6 X7 X4 X5
1000010 0.3122774 0.0966516 0.1963904 0.3521729 0.0425077 0.0000000 0.0000000
1000030 0.1507087 0.1683541 0.2030662 0.0000000 0.1894706 0.1926526 0.0957478
1000080 0.2706490 0.0000000 0.1563422 0.2249263 0.1017699 0.1172566 0.1290560
1000360 0.3370246 0.1969186 0.0332210 0.0000000 0.2224362 0.1304766 0.0799230
1002560 0.0000000 0.0000000 0.2739726 0.4383562 0.0575342 0.2301370 0.0000000
1003520 0.0000000 0.0421730 0.1022159 0.7598284 0.0000000 0.0829164 0.0128663

A cluster count variable will also be derived and examined in the following code.

# Number of distinct tag clusters each game has non-zero weight in
cluster.count <- clusterWeight.wide %>%
  mutate(clusterCount = rowSums(dplyr::select(., -appID) != 0)) %>%
  dplyr::select(appID, clusterCount)
head(cluster.count)
appID clusterCount
1000010 5
1000030 6
1000080 6
1000360 6
1002560 4
1003520 5
hist(cluster.count$clusterCount)

# join game data to add `logRecommendations` feature, then the cluster count
# and per-cluster weights, all keyed on appID
clusterRecommendations <- games_df.filtered[, c("appID", "logRecommendations", "price")] %>%
  left_join(cluster.count, by = "appID") %>%
  left_join(clusterWeight.wide, by = "appID")

# Correlations among targets, cluster count and cluster weights (appID dropped)
clust_corr_matrix <- cor(clusterRecommendations[, -1])

corrplot(clust_corr_matrix,
         method = "color", type = "upper",
         addCoef.col = "black",          # coefficient labels in black
         tl.col = "black", tl.srt = 45,  # label colour / rotation
         tl.cex = 1,                     # label size
         number.cex = 0.8)               # coefficient label size

It could be seen that cluster 1 and cluster 5 have some linear associations with log recommendations and price. The cluster counts was also shown to have some linear associations with recommendations, since each cluster represents a group of tags that are related to each other, having presence in more clusters means the game have tags that are diverse and not redundant. For this reason, I will include the cluster weights and cluster counts for my regression problem as predictors. Similarly, Cluster weights will be included for my classification task too.

Regression Model

For the regression task, I will use the Negative Binomial Regression model to predict recommendations, since this is a count variable with variance a lot higher than the mean, a Negative Binomial Regression model is suitable because it is specifically designed to handle overdispersion of a count variable.

The initially selected predictors for the models will be: estimatedSkew, price, positivePct, languagesCount, age16, industryLeader, logMedianPlaytime, dlcCount, clusterCount, and the cluster weights. Since there are a total of 17 features, a subset selection process will be implemented to determine a smaller subset of features or coefficients that best optimizes the predictive performance of the model and simplifies it.

Regression task

The data is split into (8:2) training and testing partitions: - The training partition would be used to estimate a generalized test error for different models. - The testing partition would be used to evaluate the final test error of the selected model.

Three models would be considered for the regression task, namely, Negative Binomial Regression, random forest, and XGBoost. The Negative Binomial Regression model assumes linear relationships between the log-transformed target variable and the independent features, which is why I have chosen features that are most correlated with the log-transformed target variable; medianPlaytime is also log-transformed because it was shown to produce a stronger correlation with the log-transformed target.

# features used for both GLM and tree-based models
# (targets + previously selected predictors, joined with per-game cluster
# weights and cluster counts; appID is only a join key and is dropped below)
regr.features <- left_join(games_df.filtered[,c('recommendations','appID','estimatedSkew', 'price', 'positivePct', 'languagesCount', 'age16', 'industryLeader', 'logMedianPlaytime', 'dlcCount')],
                               clusterWeight.wide, 
                               by = 'appID') 

regr.features <- left_join(regr.features, cluster.count, by = 'appID') %>% dplyr::select(-appID) 

# define task to store all features
regr_task = as_task_regr(regr.features,
                  target = "recommendations",
                  id = "regr task all")

# specify that the feature "age16" and "industryLeader" is stratified during split
# (both remain model features; the extra "stratum" role balances their
# classes across the train/test partition)
regr_task$set_col_roles("age16", c("feature","stratum"))
regr_task$set_col_roles("industryLeader", c("feature","stratum"))

# Set seed for reproducibility
set.seed(1)
# split train test partition
regr_splits <- partition(regr_task, ratio = 0.8)

print(regr_task)
## <TaskRegr:regr task all> (5596 x 17)
## * Target: recommendations
## * Properties: strata
## * Features (16):
##   - dbl (14): X1, X2, X3, X4, X5, X6, X7, clusterCount, dlcCount,
##     estimatedSkew, languagesCount, logMedianPlaytime, positivePct,
##     price
##   - lgl (2): age16, industryLeader
## * Strata: age16, industryLeader

Baseline Regression model

This model would be used as a baseline for comparison of other models.

# load featureless learner
# (ignores all features and predicts a constant from the training targets —
# a naive baseline for the other models to beat)
regr_featureless = lrn("regr.featureless")

regr_featureless$train(regr_task, regr_splits$train)

prediction_bas = regr_featureless$predict(regr_task, regr_splits$test)

# Baseline test metrics; R^2 near zero is expected for a constant predictor
prediction_bas$score(msrs(c("regr.mse","regr.rmse","regr.rsq","regr.mae")))
##      regr.mse     regr.rmse           rsq      regr.mae 
##  1.139723e+08  1.067578e+04 -3.297003e-03  5.964871e+03

XGBoost

tuning and training XGBoost model with training data.

Training optimized XGBoost model on whole training partition

# Build a fresh XGBoost learner and install the tuned hyper-parameters,
# then fit it on the full training partition.
# NOTE(review): xgboost_tune_instance and regr_task.train are created in an
# earlier tuning chunk not shown here — presumably the tuning instance and a
# training-rows subtask; confirm against that chunk.
xgboost_lrn_tuned <- lrn("regr.xgboost")
xgboost_lrn_tuned$param_set$values = xgboost_tune_instance$result_learner_param_vals

xgboost_lrn_tuned$train(regr_task.train)

Feature importance

Feature importance based on contribution of each feature to the model’s objective function.

# Extract importance and convert to data.frame
# (mlr3 returns a named numeric vector of per-feature gains)
imp_vec <- xgboost_lrn_tuned$importance()
xgb_regr_importance <- data.frame(
  feature = names(imp_vec),
  importance = unname(imp_vec)
)

# Sort features by importance (descending)
xgb_regr_importance <- arrange(xgb_regr_importance, desc(importance))
print(xgb_regr_importance)
##              feature  importance
## 1      estimatedSkew 0.351427986
## 2  logMedianPlaytime 0.176837812
## 3        positivePct 0.115743044
## 4     languagesCount 0.077117969
## 5                 X5 0.062503218
## 6                 X3 0.051692464
## 7           dlcCount 0.026793850
## 8              price 0.024226841
## 9                 X7 0.020418356
## 10                X4 0.019165813
## 11                X6 0.017165894
## 12                X2 0.017083508
## 13                X1 0.016268271
## 14      clusterCount 0.009101299
## 15             age16 0.008550836
## 16    industryLeader 0.005902840

# Horizontal bar chart of the importance scores, largest on top
ggplot(xgb_regr_importance,
       aes(x = reorder(feature, importance), y = importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +  # Horizontal bars for readability
  labs(title = "Feature Importance in XGBoost Model",
       x = "Feature",
       y = "Importance Score") +
  theme_minimal()

Compare with Decision Tree using top 5 features

regr_rpart_lrn <- lrn("regr.rpart")

regr_rpart_subset <- regr_task$clone()

# Restrict the task to the top 5 features from the tuned XGBoost model
# (the original comment said "rforest"; the importance comes from XGBoost,
# and mlr3's importance() is already sorted in decreasing order)
regr_rpart_subset$select(names(head(xgboost_lrn_tuned$importance(), 5)))

regr_rpart_lrn$train(regr_rpart_subset, row_ids = regr_splits$train)
regr_rpart_pred <- regr_rpart_lrn$predict(regr_rpart_subset, row_ids = regr_splits$test)

# roundint = FALSE: the fitted model does not retain its training data, so
# rpart.plot cannot determine which variables are integer-valued; this is
# the fix the warning itself recommends.
rpart.plot(regr_rpart_lrn$model, roundint = FALSE)

### Test results

# Predict the held-out test rows with the tuned XGBoost model
xgboost_prediction <- xgboost_lrn_tuned$predict(regr_task, row_ids = regr_splits$test)

xgboost_prediction$score(msrs(c("regr.mse","regr.rmse","regr.rsq","regr.mae")))
##     regr.mse    regr.rmse          rsq     regr.mae 
## 6.692919e+07 8.181027e+03 4.108229e-01 3.773073e+03
# Residuals on the response scale (observed minus predicted)
xgboost_residuals <- xgboost_prediction$truth - xgboost_prediction$response


# Actual vs. predicted; points on the identity line are perfect predictions
plot(xgboost_prediction$response, xgboost_prediction$truth, 
     main = "Actual vs. Predicted", 
     xlab = "Predicted Values", 
     ylab = "Actual Values",
     pch =20,
     cex = 0.8)
abline(0, 1)  # Line of perfect prediction

# Residual distribution on the test set
hist(xgboost_residuals, main = "Histogram of Test Residuals",xlab = "Residuals", breaks = 40)

# Residuals against fitted values to inspect heteroscedasticity / bias
plot(x = xgboost_prediction$response, y = xgboost_residuals, 
     main = "Residuals vs. Predicted Values", 
     xlab = "Predicted Values", 
     ylab = "Residuals",
     pch = 20,
     cex = 0.8)
abline(h = 0)

Visualize prediction using important features

# Join the held-out test rows with the XGBoost predictions so the predicted
# values can be plotted against individual features.
test_rows <- regr_task$data(rows = regr_splits$test)

pred_cols <- data.frame(
  truth = xgboost_prediction$truth,
  response = xgboost_prediction$response
)

combined_data <- dplyr::bind_cols(test_rows, pred_cols)

# Predicted recommendations against the two most important features
ggplot(combined_data, aes(x = estimatedSkew, y = response)) +
  geom_point()

ggplot(combined_data, aes(x = logMedianPlaytime, y = response)) +
  geom_point()

Negative Binomial Regression

Since the target recommendations is a count variable that has variance a lot higher than the mean, a Negative Binomial would be suitable for this problem. I will begin by first determining a subset of features using a forward-stepwise selection algorithm, with AIC as criterion of subset selection. The selected features would then be used in the cross-validation process to estimate the performance measures of the model with subset features.

Forward Stepwise selection for NB, with AIC as criterion

# get training split for subset selection

# Full model (all predictors) and null model (intercept only) define the
# search scope for the forward stepwise selection below.
nb_regr.full <- glm.nb(as.formula(recommendations ~ .), data = regr_task.train$data())
nb_regr.null <- glm.nb(as.formula(recommendations ~ 1), data = regr_task.train$data())

# Forward search from the null model toward the full model, adding at each
# step the predictor that most reduces AIC.
fw_nb_regr <- step(nb_regr.null, 
                   scope = list(lower = nb_regr.null, upper = nb_regr.full), 
                   direction = "forward", 
                   k = 2,# AIC criterion
                   trace = FALSE) # suppress output
# summary of the selected model
nb_summary <- summary(fw_nb_regr)
print(nb_summary)
## 
## Call:
## glm.nb(formula = recommendations ~ estimatedSkew + logMedianPlaytime + 
##     languagesCount + positivePct + X5 + price + X4 + X3 + clusterCount + 
##     age16 + X1 + dlcCount + X2, data = regr_task.train$data(), 
##     init.theta = 1.290444021, link = log)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        2.788520   0.133944  20.818  < 2e-16 ***
## estimatedSkew      1.651871   0.033759  48.931  < 2e-16 ***
## logMedianPlaytime  0.270035   0.014182  19.041  < 2e-16 ***
## languagesCount     0.042923   0.002539  16.904  < 2e-16 ***
## positivePct        2.482547   0.104810  23.686  < 2e-16 ***
## X5                 1.928823   0.124934  15.439  < 2e-16 ***
## price              0.014065   0.001274  11.043  < 2e-16 ***
## X4                -0.663502   0.098910  -6.708 1.97e-11 ***
## X3                 0.809634   0.092694   8.734  < 2e-16 ***
## clusterCount       0.134362   0.012206  11.008  < 2e-16 ***
## age16TRUE          0.388600   0.052082   7.461 8.57e-14 ***
## X1                -0.468189   0.124969  -3.746 0.000179 ***
## dlcCount           0.015627   0.003354   4.660 3.17e-06 ***
## X2                 0.215163   0.086243   2.495 0.012601 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.2904) family taken to be 1)
## 
##     Null deviance: 14393.0  on 4475  degrees of freedom
## Residual deviance:  5023.8  on 4462  degrees of freedom
## AIC: 78491
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.2904 
##           Std. Err.:  0.0246 
## 
##  2 x log-likelihood:  -78460.6080
# Calculate and print AIC of the final model selected by forward stepwise
final_aic <- AIC(fw_nb_regr)
print(paste("Final AIC:", final_aic))
## [1] "Final AIC: 78490.6076292886"

The log link function is used for the Negative binomial distribution, where y = exp(intercept + B1X1 + … + BnXn); the dispersion parameter theta is estimated to be 1.29, which indicates that the assumption of overdispersion is fulfilled and that this model is preferred over a Poisson GLM.

Almost all predictor variables are statistically significant (p-values < 0.001) based on their associated z-values and significance codes (***). This suggests that they have a meaningful relationship with the response variable.

Positive Coefficients: Variables such as estimatedSkew, logMedianPlaytime, languagesCount, positivePct, price, X5, X3, clusterCount, age16, dlcCount, and X2 are associated with an increase in the expected count of recommendations. For example: a 1-unit increase in clusterCount is associated with a multiplicative increase of exp(0.134) = 1.144, or around a 14.4% increase in the expected count of recommendations.

Negative Coefficients: The variables X4 and X1 have negative coefficients, indicating a decrease in the expected count.

Null Deviance: The deviance of the model with only the intercept is 14393.0.

Residual Deviance: The deviance of the fitted model with predictors is 5023.8. This large reduction from the null deviance indicates that the model explains a significant portion of the variability in the data.

Plotting the coefficients

plot_summs(fw_nb_regr)

Look at how the different predictors are used to estimate recommendations in the model (y-axis limited to below 10,000).

# Partial-effect plot for estimatedSkew with CI band and jittered data
# points; y-axis capped so the bulk of observations stays visible
effect_plot(fw_nb_regr, pred = estimatedSkew, interval = TRUE, plot.points = TRUE, 
            jitter = 0.05) + ylim(0, 10000)
## Warning: Removed 566 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_path()`).

# Partial-effect plot for logMedianPlaytime (same capped y-axis)
effect_plot(fw_nb_regr, pred = logMedianPlaytime, interval = TRUE, plot.points = TRUE, 
            jitter = 0.05) + ylim(0, 10000)
## Warning: Removed 566 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_path()`).

# Partial-effect plot for positivePct (same capped y-axis)
effect_plot(fw_nb_regr, pred = positivePct, interval = TRUE, plot.points = TRUE, 
            jitter = 0.05) + ylim(0, 10000)
## Warning: Removed 566 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Partial-effect plot for tag-cluster weight X5 (same capped y-axis)
effect_plot(fw_nb_regr, pred = X5, interval = TRUE, plot.points = TRUE, 
            jitter = 0.05) + ylim(0, 10000)
## Warning: Removed 566 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 7 rows containing missing values or values outside the scale range
## (`geom_path()`).

# Partial-effect plot for tag-cluster weight X4 (same capped y-axis)
effect_plot(fw_nb_regr, pred = X4, interval = TRUE, plot.points = TRUE, 
            jitter = 0.05) + ylim(0, 10000)
## Warning: Removed 566 rows containing missing values or values outside the scale range
## (`geom_point()`).

### Selected features

# Names of the predictors chosen by forward selection. Note that factor /
# logical terms keep their level suffix here (e.g. "age16TRUE"), which is
# mapped back to the column name before refitting.
nb_selected_features <- names(coef(fw_nb_regr))[-1] # Exclude the intercept
print(nb_selected_features)
##  [1] "estimatedSkew"     "logMedianPlaytime" "languagesCount"   
##  [4] "positivePct"       "X5"                "price"            
##  [7] "X4"                "X3"                "clusterCount"     
## [10] "age16TRUE"         "X1"                "dlcCount"         
## [13] "X2"

Training final model on whole training partition

# Map the coefficient label "age16TRUE" back to the column name "age16"
nb_selected_features <- gsub("age16TRUE", "age16", nb_selected_features)

# Build "recommendations ~ f1 + f2 + ..." directly from the feature names
nb_selected_formula <- reformulate(nb_selected_features, response = "recommendations")

# Refit the negative binomial model with the selected subset on the whole
# training partition
nb_regr.final <- glm.nb(nb_selected_formula, data = regr_task.train$data())

Test on unseen data

Testing predicted results against observed values on new data.

# Subset the test data task object to include only the selected features
regr_task.test_nb <- regr_task$clone()
regr_task.test_nb$filter(rows = regr_splits$test)
regr_task.test_nb$select(nb_selected_features)

# type = "response" inverts the log link, returning expected counts
nb_predicted.test <- predict(nb_regr.final, newdata = regr_task.test_nb$data(), type = "response")
nb_actual.test <- regr_task.test_nb$data()$recommendations

# Root mean squared error and mean absolute error on the test partition
test_rmse <- sqrt(mean((nb_actual.test - nb_predicted.test)^2))
test_mae <- mean(abs(nb_actual.test - nb_predicted.test))

cat("Test RMSE:", test_rmse, "\n",
    "Test MAE:", test_mae)
## Test RMSE: 8437.562 
##  Test MAE: 3621.496
# Actual vs. predicted on the test set; points on the identity line are
# perfect predictions
plot(nb_predicted.test, nb_actual.test, 
     main = "Actual vs. Predicted", 
     xlab = "Predicted Values", 
     ylab = "Actual Values",
     pch =20,
     cex = 0.8)
abline(0, 1)  # Line of perfect prediction

# Raw residuals on the response (count) scale
test_residuals <- (nb_actual.test - nb_predicted.test)

# Pearson residuals: raw residual scaled by the NB2 standard deviation,
# using the variance function mu + mu^2 / theta
nb_pearson_residuals <- (nb_actual.test - nb_predicted.test) / sqrt(nb_predicted.test + nb_predicted.test^2 / nb_regr.final$theta)


hist(test_residuals, main = "Histogram of Test Residuals", xlab = "Residual", breaks = 100)

# Raw residuals vs. fitted values — expect a funnel shape for count data
plot(x = nb_predicted.test, y = test_residuals, 
     main = "Residuals vs. Predicted Values", 
     xlab = "Predicted Values", 
     ylab = "Residuals",
     pch = 20,
     cex = 0.8)
abline(h = 0)

hist(nb_pearson_residuals, breaks = 100, main = "Histogram of Pearson Residuals",xlab = "Pearson Residuals")

# Pearson residuals vs. fitted values — should look roughly homoscedastic
# if the NB variance assumption holds
plot(x = nb_predicted.test, y = nb_pearson_residuals, 
     main = "Pearson Residuals vs. Predicted Values", 
     xlab = "Predicted Values", 
     ylab = "Pearson Residuals",
     pch = 20,
     cex = 0.8)
abline(h = 0)

### Classification Task

For the classification task, the target would be the ordinal categorical priceRange variable, containing the category of price range based on what price it is currently sold in the Steam platform, where the range of prices are defined as: Low = 0-10, Medium = 11-20, and High = 21 or above.

table(games_df.filtered$priceRange)
## 
##    Low Medium   High 
##   2699   1823   1074

Since there is some class imbalance in the target variable, the training and testing partitions are split using a stratified sampling method, which ensures that both partitions have a class distribution similar to the original data.

# all features used in classification tasks
classif.features <- left_join(games_df.filtered[,c('appID','priceRange','dlcCount', 'daysReleased', 'languagesCount', 'age16', 'industryLeader', 'selfPublished', 'aboveTwoHours', 'logMedianPlaytime', 'logRecommendations')], clusterWeight.wide, by = 'appID') 

# add per-game cluster counts, then drop the join key before modeling
classif.features <- left_join(classif.features, cluster.count, by = 'appID') %>% dplyr::select(-appID) 

# define task to store all features
classif_task = as_task_classif(classif.features,
                  target = "priceRange",
                  id = "classif task all")
# specify that the target and feature "age16" is stratified during split,
# so train/test keep the same class and age16 proportions as the full data
classif_task$set_col_roles("priceRange", c("target", "stratum"))
classif_task$set_col_roles("age16", c("feature","stratum"))

# split train test partition (80% train / 20% test, stratified)
classif_splits <- partition(classif_task, ratio = 0.8)

print(classif_task)
## <TaskClassif:classif task all> (5596 x 18)
## * Target: priceRange
## * Properties: multiclass, strata
## * Features (17):
##   - dbl (14): X1, X2, X3, X4, X5, X6, X7, aboveTwoHours, clusterCount,
##     daysReleased, dlcCount, languagesCount, logMedianPlaytime,
##     logRecommendations
##   - lgl (3): age16, industryLeader, selfPublished
## * Strata: priceRange, age16

Baseline Classification model

# load featureless learner
# Baseline: predicts the same class for every observation, giving a
# reference error rate for the tuned classifiers
lrn_featureless = lrn("classif.featureless")

lrn_featureless$train(classif_task, classif_splits$train)

prediction_bas = lrn_featureless$predict(classif_task, classif_splits$test)

# Every test row is predicted "Low" (the majority class)
prediction_bas$confusion 
##         truth
## response Low Medium High
##   Low    540    364  215
##   Medium   0      0    0
##   High     0      0    0
prediction_bas$score(msr("classif.ce"))
## classif.ce 
##  0.5174263

Class Weights to balance cost of Misclassification

Since the classes are imbalanced, I will define weights for different classes by assigning weights that are inversely proportional to the class frequencies. These weights will be used to weight the costs of misclassification of different classes to prevent bias in any of the classes.

class_counts = table(games_df.filtered$priceRange)

# Inverse-frequency class weights: sum(n) / (n_k * K) for K classes, so
# rarer classes carry a proportionally larger misclassification cost.
# Computed once, vectorized, instead of repeating the formula per class.
class_weights <- sum(class_counts) / (class_counts * length(class_counts))

low_weight    <- as.numeric(class_weights['Low'])
medium_weight <- as.numeric(class_weights['Medium'])
high_weight   <- as.numeric(class_weights['High'])

cat('Weights: \n', 'Low: ', low_weight, '\nMedium: ', medium_weight, '\nHigh: ', high_weight)
## Weights: 
##  Low:  0.6911202 
## Medium:  1.023222 
## High:  1.736809

Support Vector Machine

The Support Vector Machine (SVM) classifier identifies the optimal hyperplane (or decision boundary) that separates data points from different classes in high-dimensional space. This hyperplane is selected to maximize its distance from the nearest data points (support vectors), creating the widest possible margin. For predicting price ranges using game features and tag clusters, classes may not be linearly separable. The kernel trick addresses this by enlarging the feature space in a specific way, implicitly projecting data into a higher-dimensional space where separation is possible. The Support Vector Machine algorithm can then find the decision boundaries that maximize the margin between classes. Since I have three classes (Low, Medium, High), the algorithm would use a One vs One strategy by default, which would create 3(3-1)/2 = 3 binary classifiers: Low vs Medium, Low vs High, Medium vs High, and the final classification would be the class to which it was most frequently voted in these pairwise classifications.

Final tuned parameters result

# optimal hyperparameters from tuning (class weights, kernel, cost, gamma)
svm_ti$result$learner_param_vals
## [[1]]
## [[1]]$class.weights
##       Low    Medium      High 
## 0.6911202 1.0232218 1.7368094 
## 
## [[1]]$kernel
## [1] "radial"
## 
## [[1]]$type
## [1] "C-classification"
## 
## [[1]]$cost
## [1] 10
## 
## [[1]]$gamma
## [1] 0.005994843
autoplot(svm_ti, type = "surface")

Visualizing decision boundaries

Visualize the hyperplanes using two selected variables.

# Subset the task to include only the two variables and the target
classif_task.plot <- classif_task$clone()
classif_task.plot$select(c("logMedianPlaytime", "daysReleased"))

# create tuned model object with the hyperparameters found during tuning
svm_lrn_plot <- lrn("classif.svm")
svm_lrn_plot$param_set$values = svm_ti$result_learner_param_vals

# train tuned model with subset features (2-D, for visualization only)
svm_lrn_plot$train(classif_task.plot)
# Plot decision boundaries using autoplot

autoplot(svm_lrn_plot, type = "prediction", task = classif_task.plot)

### Training the optimized model on entire train data

# create tuned model
# predict_type = "prob" so the learner also returns class probabilities
svm_tuned <- lrn("classif.svm", predict_type = "prob")
svm_tuned$param_set$values = svm_ti$result_learner_param_vals

# train tuned model with whole training partition
svm_tuned$train(classif_task, classif_splits$train)

Test on unseen data

# Predict the held-out rows with the tuned SVM
svm_tuned_prediction <- svm_tuned$predict(classif_task, 
                                          row_ids = classif_splits$test)

# autoplot(svm_tuned_prediction)
# Confusion matrix: rows = predicted class, columns = true class
svm_tuned_prediction$confusion
##         truth
## response Low Medium High
##   Low    444    141   27
##   Medium  83    174   76
##   High    13     49  112
svm_tuned_prediction$score(msrs(c("classif.ce", "classif.acc")))
##  classif.ce classif.acc 
##   0.3476318   0.6523682

Calculating precision and Recall manually:

svm_confusion <- svm_tuned_prediction$confusion

# Confusion matrix is response (rows) x truth (columns), so:
#   precision_k = TP_k / row-sum_k  (share of predicted-k that are truly k)
#   recall_k    = TP_k / col-sum_k  (share of truly-k that were found)
# Computed for all classes at once instead of one hand-indexed line each.
svm_precisions <- diag(svm_confusion) / rowSums(svm_confusion)
svm_recalls    <- diag(svm_confusion) / colSums(svm_confusion)

low_precision    <- svm_precisions[['Low']]
medium_precision <- svm_precisions[['Medium']]
high_precision   <- svm_precisions[['High']]

low_recall    <- svm_recalls[['Low']]
medium_recall <- svm_recalls[['Medium']]
high_recall   <- svm_recalls[['High']]

# macro average: unweighted mean over the three classes
macro_precision <- mean(c(low_precision, medium_precision, high_precision))
macro_recall <- mean(c(low_recall, medium_recall, high_recall))

cat('Precision scores: \n',
    paste('Low:', round(low_precision,2)),'\n',
    paste('Medium:', round(medium_precision,2)),'\n',
    paste('High:', round(high_precision,2)),'\n',
    paste('Average Precision:', round(macro_precision,2)),'\n\n')
## Precision scores: 
##  Low: 0.73 
##  Medium: 0.52 
##  High: 0.64 
##  Average Precision: 0.63
# Per-class and macro-averaged recall for the tuned SVM
cat('Recall scores: \n',
    paste('Low:', round(low_recall,2)),'\n',
    paste('Medium: ', round(medium_recall,2)),'\n',  # NOTE(review): trailing space after "Medium:" matches the rendered output
    paste('High:', round(high_recall,2)),'\n',
    paste('Average Recall:', round(macro_recall,2)))
## Recall scores: 
##  Low: 0.82 
##  Medium:  0.48 
##  High: 0.52 
##  Average Recall: 0.61

Results have shown that this model performed well in predicting Low and High classes, but suffers when predicting the Medium class. This is expected, as games priced between 11 and 20 often exhibit characteristics of both expensive and inexpensive games, making predictions for this class more challenging.

Random Forest Classification

The final optimized parameters are determined based on the classification logloss or the cross-entropy measure, which quantifies the difference between predicted probabilities and actual outcomes. This measure is chosen because it accounts for how confident the mistakes are, making it suitable for data that has class imbalance.

Optimized tuning parameters:

rpart_classif_ti$result_learner_param_vals
## $class.weights
##       Low    Medium      High 
## 0.6911202 1.0232218 1.7368094 
## 
## $importance
## [1] "impurity"
## 
## $num.threads
## [1] 1
## 
## $max.depth
## [1] 30
## 
## $min.node.size
## [1] 3
## 
## $num.trees
## [1] 500

Training optimized Random Forest model on whole training partition

# NOTE(review): despite the "rpart" prefix of rpart_classif_ti, the tuned
# learner is a ranger random forest (see num.trees / min.node.size above) —
# consider renaming the tuning instance for clarity.
rforest_classif_tuned <- lrn("classif.ranger")

# train model with tuned hyperparameters
rforest_classif_tuned$param_set$values = rpart_classif_ti$result_learner_param_vals

rforest_classif_tuned$train(classif_task, classif_splits$train)

Feature importance

Ranking the feature’s importance based on the decrease in node impurity when a feature is used to split a node.

# Extract importance and convert to data.frame
rpart_importance <- data.frame(
  feature = names(rforest_classif_tuned$importance()),
  importance = unname(rforest_classif_tuned$importance())
)

# Sort features by importance (descending)
rpart_importance <- rpart_importance %>%
  arrange(desc(importance))
print(rpart_importance)
##               feature importance
## 1   logMedianPlaytime  367.78669
## 2        daysReleased  360.77438
## 3  logRecommendations  338.64965
## 4                  X4  261.19174
## 5                  X3  195.64741
## 6                  X5  183.81999
## 7      languagesCount  174.38276
## 8                  X1  167.31301
## 9                  X7  163.57605
## 10                 X6  145.16650
## 11                 X2  140.89194
## 12           dlcCount  136.30984
## 13       clusterCount   74.31522
## 14     industryLeader   57.80119
## 15      selfPublished   39.71734
## 16      aboveTwoHours   21.12932
## 17              age16   20.30442

Plot importance

# Horizontal bar chart of random forest feature importance, largest on top
ggplot(rpart_importance, aes(x = reorder(feature, importance), y = importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +          # flip so feature names read horizontally
  theme_minimal() +
  labs(
    title = "Feature Importance in Random Forest",
    x = "Feature",
    y = "Importance Score"
  )

Visualize top 5 features with decision tree

# Fit a single decision tree on the forest's top features so the split
# structure can be visualized with rpart.plot.
classif_rpart_lrn <- lrn("classif.rpart")

# Clone so feature selection does not modify the original task
classif_rpart_subset <- classif_task$clone()

#look at how top 5 features from rforest would be plotted
classif_rpart_subset$select(names(head(rforest_classif_tuned$importance(),5)))

classif_rpart_lrn$train(classif_rpart_subset, row_ids = classif_splits$train)
# roundint = FALSE: rpart.plot cannot see the training data through the
# mlr3 model object, so disable integer-rounding detection — the targeted
# fix rpart.plot's own warning suggests — instead of suppressing all
# warnings wholesale.
rpart.plot(classif_rpart_lrn$model, roundint = FALSE)

### Test results

# Predict the held-out rows with the tuned random forest
rforest_classif_prediction <- rforest_classif_tuned$predict(classif_task, classif_splits$test)

# Confusion matrix: rows = predicted class, columns = true class
rforest_classif_prediction$confusion
##         truth
## response Low Medium High
##   Low    448    142   18
##   Medium  76    169   80
##   High    16     53  117
rforest_classif_prediction$score(msrs(c("classif.ce", "classif.acc")))
##  classif.ce classif.acc 
##   0.3440572   0.6559428

Visualize prediction using important features

# Get the test data from the task
test_data <- classif_task$data(rows = classif_splits$test)

# get predictions as df
predictions_df <- data.frame(
  truth = rforest_classif_prediction$truth,
  response = rforest_classif_prediction$response
)

# Combine test data with predictions
combined_data <- dplyr::bind_cols(test_data, predictions_df)


# Predicted price range over the highest-ranked feature pairs
ggplot(combined_data, aes(x = daysReleased, y = logMedianPlaytime, color = response)) +
  geom_point() +
  labs(color = "Predicted Price Range")

ggplot(combined_data, aes(x = logRecommendations, y = logMedianPlaytime, color = response)) +
  geom_point() +
  labs(color = "Predicted Price Range") 

Precision and Recall scores for each class

rforest_confusion <- rforest_classif_prediction$confusion

# Rows are predicted classes, columns are true classes:
#   precision_k = TP_k / row-sum_k, recall_k = TP_k / col-sum_k.
# Vectorized over all classes instead of one hand-indexed line per cell.
rf_precisions <- diag(rforest_confusion) / rowSums(rforest_confusion)
rf_recalls    <- diag(rforest_confusion) / colSums(rforest_confusion)

low_precision    <- rf_precisions[['Low']]
medium_precision <- rf_precisions[['Medium']]
high_precision   <- rf_precisions[['High']]

low_recall    <- rf_recalls[['Low']]
medium_recall <- rf_recalls[['Medium']]
high_recall   <- rf_recalls[['High']]

# macro average: unweighted mean over the three classes
macro_precision <- mean(c(low_precision, medium_precision, high_precision))
macro_recall <- mean(c(low_recall, medium_recall, high_recall))

cat('Precision scores: \n'
  ,paste('Low:', round(low_precision,2)),'\n'
  ,paste('Medium:', round(medium_precision,2)),'\n'
  ,paste('High:', round(high_precision,2)),'\n'
  ,paste('Average Precision:', round(macro_precision,2)),'\n\n')
## Precision scores: 
##  Low: 0.74 
##  Medium: 0.52 
##  High: 0.63 
##  Average Precision: 0.63
# Per-class and macro-averaged recall for the random forest
cat('Recall scores: \n'
  ,paste('Low:', round(low_recall,2)),'\n'
  ,paste('Medium: ', round(medium_recall,2)),'\n'
  ,paste('High:', round(high_recall,2)),'\n'
  ,paste('Average Recall:', round(macro_recall,2)))
## Recall scores: 
##  Low: 0.83 
##  Medium:  0.46 
##  High: 0.54 
##  Average Recall: 0.61